24-Nov: added DBSCAN (and spectral clustering) to the basic clustering notebook.
Goal: train the clustering models and adjust their parameters.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.simplefilter('ignore',DeprecationWarning)
import seaborn as sns
import time
import copy
from pylab import rcParams
#import hdbscan
from sklearn.model_selection import ShuffleSplit
from sklearn.preprocessing import StandardScaler
#from sklearn.datasets import make_blobs
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn import metrics
from sklearn import metrics as mt
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score as acc
from sklearn.metrics import confusion_matrix as conf
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.cluster import KMeans
from tabulate import tabulate
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from __future__ import print_function
# Load the cleaned Mashable dataset and derive log-transformed LDA topic features.
data_dir = '../data/'
data_file = 'mashable_clean_dataset_for_lab_02_task_02.csv'
file_2_read = data_dir + data_file
df = pd.read_csv(file_2_read)

# Work on an independent copy so the raw frame stays untouched.
df_cluster = copy.deepcopy(df)
del df_cluster['data_channel']   # presumably a non-numeric label column — dropped before clustering

# log(x + 1) transform of the LDA topic proportions.
# (The loop-body indentation was lost in the notebook export; restored here.)
for column in ['LDA_00', 'LDA_01', 'LDA_02', 'LDA_03', 'LDA_04']:
    new_col_name = 'ln_' + column
    print (new_col_name)
    df_cluster[new_col_name] = np.log(df_cluster[column]+1)

col_names = df_cluster.columns.values.tolist()
col_names                  # bare expression: notebook display
df_cluster.describe().T    # bare expression: notebook display
from matplotlib import pyplot as plt
plt.style.use("ggplot")
%matplotlib inline
X1 = df_cluster[['ln_LDA_00','ln_LDA_01', 'ln_LDA_02', 'ln_LDA_03', 'ln_LDA_04']].values
plt.figure(figsize = (12,12))
plt.subplot(221)
plt.scatter(X1[:, 1], X1[:, 0],
s = 20,
alpha = 0.10)
plt.xlabel('LDA_00'), plt.ylabel('LDA_01')
plt.grid()
plt.title('LDA_00 vs. LDA_01')
plt.subplot(222)
plt.scatter(X1[:, 2], X1[:, 0],
s = 20,
alpha = 0.10)
plt.xlabel('LDA_00'), plt.ylabel('LDA_02')
plt.grid()
plt.title('LDA_00 vs. LDA_01')
plt.subplot(223)
plt.scatter(X1[:, 3], X1[:, 0],
s = 20,
alpha = 0.10)
plt.xlabel('LDA_00'), plt.ylabel('LDA_03')
plt.grid()
plt.title('LDA_00 vs. LDA_01')
plt.subplot(224)
plt.scatter(X1[:, 4], X1[:, 0],
s = 20,
alpha = 0.01)
plt.xlabel('LDA_00'), plt.ylabel('LDA_04')
plt.grid()
plt.title('LDA_00 vs. LDA_01')
plt.show();
# Accumulator table for clustering model comparison; one row appended per
# fitted model (model name, cluster count, inertia, silhouette, wall time).
comparison_tbl = pd.DataFrame(columns = [
    'model_name',
    'n_clusters',
    'inertia',
    'silhouette',
    'process_time'])
# Running row index for the table (the dead `i_index = []` assignment,
# immediately overwritten, was removed).
i_index = 0
# preparation for cross validation and model comparison, each classifier is appended once model is fit
models = []
# KMeans over the log-LDA topic features, sweeping the number of clusters.
# For each cluster count: fit, record inertia/silhouette/wall time in
# comparison_tbl, and plot the cluster assignments per feature pair.
# (The loop-body indentation was lost in the notebook export; restored here.)
for n_lda in range(2, 12):
    tic = time.perf_counter()   # time.clock() was removed in Python 3.8
    print ("n_lda = ", n_lda)
    X1 = df_cluster[['ln_LDA_00','ln_LDA_01', 'ln_LDA_02', 'ln_LDA_03', 'ln_LDA_04']]
    cls_lda = KMeans(n_clusters = n_lda,
                     init = 'k-means++',
                     random_state = 1)
    cls_lda.fit(X1)
    kmeans_labels = cls_lda.labels_          # cluster assignment per row
    kmeans_centers = cls_lda.cluster_centers_
    kmeans_inertia = cls_lda.inertia_
    print ("inertia = ", kmeans_inertia)
    # Silhouette on a 10k random subsample to keep runtime reasonable.
    kmeans_silhouette = metrics.silhouette_score(X1,
                                                 kmeans_labels,
                                                 metric = 'euclidean',
                                                 sample_size = 10000)
    print ("silhouette = ", kmeans_silhouette)
    toc = time.perf_counter()
    # ... save statistics for model comparison
    exe_time = '{0:.4f}'.format(toc-tic)
    raw_data = {
        'model_name' : 'KMeans - LDA features',
        'n_clusters' : n_lda,
        'inertia': kmeans_inertia,
        'silhouette': kmeans_silhouette,
        'process_time' : exe_time
    }
    # Advance the row index so each model gets a unique label
    # (it was previously never incremented, so every row got index 1).
    i_index += 1
    df_tbl = pd.DataFrame(raw_data,
                          columns = ['model_name', 'n_clusters', 'inertia', 'silhouette', 'process_time'],
                          index = [i_index])
    # DataFrame.append was removed in pandas 2.0 — use pd.concat instead.
    comparison_tbl = pd.concat([comparison_tbl, df_tbl])
    # ... make some plots of clusters: each LDA feature vs. ln_LDA_00,
    # points colored by cluster, centers drawn as large triangles.
    plt.figure(figsize=(12, 12));
    plt.subplot(221);
    X1 = X1.values;
    plt.scatter(X1[:, 0], X1[:, 1],
                c = kmeans_labels,
                cmap = plt.cm.rainbow,
                s = 50,
                linewidths = 0,
                alpha = 0.05);
    plt.scatter(kmeans_centers[:, 0], kmeans_centers[:, 1],
                c = range(n_lda),
                cmap = plt.cm.rainbow,
                s = 400,
                linewidths = 1.0,
                marker = '^',
                edgecolors = 'black',
                alpha = 0.90);
    plt.text(0.8, 0.8,
             kmeans_inertia)   # annotate the panel with this model's inertia
    plt.xlabel('LDA_00'), plt.ylabel('LDA_01');
    plt.grid();
    plt.subplot(222);
    plt.scatter(X1[:, 0], X1[:, 2],
                c = kmeans_labels,
                cmap = plt.cm.rainbow,
                s = 50,
                linewidths = 0,
                alpha = 0.05);
    plt.scatter(kmeans_centers[:, 0], kmeans_centers[:, 2],
                c = range(n_lda),
                cmap = plt.cm.rainbow,
                s = 400,
                linewidths = 1.0,
                marker = '^',
                edgecolors = 'black',
                alpha = 0.90);
    plt.xlabel('LDA_00'), plt.ylabel('LDA_02');
    plt.grid();
    plt.subplot(223);
    plt.scatter(X1[:, 0], X1[:, 3],
                c = kmeans_labels,
                cmap = plt.cm.rainbow,
                s = 50,
                linewidths = 0,
                alpha = 0.05);
    plt.scatter(kmeans_centers[:, 0], kmeans_centers[:, 3],
                c = range(n_lda),
                cmap = plt.cm.rainbow,
                s = 400,
                linewidths = 1.0,
                marker = '^',
                edgecolors = 'black',
                alpha = 0.90);
    plt.xlabel('LDA_00'), plt.ylabel('LDA_03');
    plt.grid();
    plt.subplot(224);
    plt.scatter(X1[:, 0], X1[:, 4],
                c = kmeans_labels,
                cmap = plt.cm.rainbow,
                s = 50,
                linewidths = 0,
                alpha = 0.05);
    plt.scatter(kmeans_centers[:, 0], kmeans_centers[:, 4],
                c = range(n_lda),
                cmap = plt.cm.rainbow,
                s = 400,
                linewidths = 1.0,
                marker = '^',
                edgecolors = 'black',
                alpha = 0.90);
    plt.xlabel('LDA_00'), plt.ylabel('LDA_04');
    plt.grid();
    plt.show();
# ---------------------------------------------------------------------------
# Print the running comparison table (bare expression: notebook display).
# ---------------------------------------------------------------------------
comparison_tbl
# ---------------------------------------------------------------------------
# Plot comparison metrics (silhouette, inertia, process time) against the
# number of clusters for all rows accumulated so far.
# ---------------------------------------------------------------------------
plt.figure(figsize=(16, 6));
# ... silhouette vs. n_clusters
plt.subplot(131);
plt.scatter(comparison_tbl['n_clusters'],
comparison_tbl['silhouette'],
s = 40,
linewidths = 1.0,
marker = '^',
edgecolors = 'black',
alpha = 0.90);
plt.plot(comparison_tbl['n_clusters'],
comparison_tbl['silhouette'])
plt.xlabel('n_clusters'), plt.ylabel('silhouette');
plt.grid();
# ... inertia vs. n_clusters (elbow plot)
plt.subplot(132);
plt.scatter(comparison_tbl['n_clusters'],
comparison_tbl['inertia'],
s = 40,
linewidths = 1.0,
marker = '^',
edgecolors = 'black',
alpha = 0.90);
plt.plot(comparison_tbl['n_clusters'],
comparison_tbl['inertia'])
plt.xlabel('n_clusters'), plt.ylabel('inertia');
plt.grid();
# ... process time vs. n_clusters
# NOTE(review): process_time is stored as a formatted string, so this axis is
# treated as categorical by matplotlib — consider casting to float first.
plt.subplot(133);
plt.scatter(comparison_tbl['n_clusters'],
comparison_tbl['process_time'],
s = 40,
linewidths = 1.0,
marker = '^',
edgecolors = 'black',
alpha = 0.90);
#plt.plot(comparison_tbl['n_clusters'],
#         comparison_tbl['process_time'])
plt.xlabel('n_clusters'), plt.ylabel('process_time');
plt.grid();
plt.show();
# KMeans on the log-transformed media-count features, sweeping cluster count.
# NOTE(review): assumes df_cluster already contains ln_num_imgs, ln_num_videos
# and ln_num_hrefs — they are not created in the visible cells; presumably
# they come from the cleaned CSV. Confirm before running.
# (The loop-body indentation was lost in the notebook export; restored here.)
for n_lda in range(2, 10):
    tic = time.perf_counter()   # time.clock() was removed in Python 3.8
    X1 = df_cluster[['ln_num_imgs','ln_num_videos', 'ln_num_hrefs']]
    cls_lda = KMeans(n_clusters = n_lda,
                     init = 'k-means++',
                     random_state = 1)
    cls_lda.fit(X1)
    kmeans_labels = cls_lda.labels_          # cluster assignment per row
    kmeans_centers = cls_lda.cluster_centers_
    kmeans_inertia = cls_lda.inertia_
    print ("n_lda = ", n_lda)
    print ("inertia = ", kmeans_inertia)
    # Silhouette on a 10k random subsample to keep runtime reasonable.
    kmeans_silhouette = metrics.silhouette_score(X1,
                                                 kmeans_labels,
                                                 metric = 'euclidean',
                                                 sample_size = 10000)
    print ("silhouette = ", kmeans_silhouette)
    toc = time.perf_counter()
    # ... save statistics for model comparison
    exe_time = '{0:.4f}'.format(toc-tic)
    raw_data = {
        'model_name' : 'KMeans - images_videos_hrefs features',
        'n_clusters' : n_lda,
        'inertia': kmeans_inertia,
        'silhouette': kmeans_silhouette,
        'process_time' : exe_time
    }
    # Unique row label per fitted model (was previously never incremented).
    i_index += 1
    df_tbl = pd.DataFrame(raw_data,
                          columns = ['model_name', 'n_clusters', 'inertia', 'silhouette', 'process_time'],
                          index = [i_index])
    # DataFrame.append was removed in pandas 2.0 — use pd.concat instead.
    comparison_tbl = pd.concat([comparison_tbl, df_tbl])
    # ... make some plots of clusters: each feature pair, colored by cluster,
    # centers drawn as large triangles.
    plt.figure(figsize = (16, 6));
    plt.subplot(131);
    X1 = X1.values;
    plt.scatter(X1[:, 0], X1[:, 1],
                c = kmeans_labels,
                cmap = plt.cm.rainbow,
                s = 50,
                linewidths = 0,
                alpha = 0.05);
    plt.scatter(kmeans_centers[:, 0], kmeans_centers[:, 1],
                c = range(n_lda),
                cmap = plt.cm.rainbow,
                s = 400,
                linewidths = 1.0,
                marker = '^',
                edgecolors = 'black',
                alpha = 0.90);
    plt.text(0.8, 0.8,
             kmeans_inertia)   # annotate the panel with this model's inertia
    plt.xlabel('images'), plt.ylabel('videos');
    plt.grid();
    plt.subplot(132);
    plt.scatter(X1[:, 0], X1[:, 2],
                c = kmeans_labels,
                cmap = plt.cm.rainbow,
                s = 50,
                linewidths = 0,
                alpha = 0.05);
    plt.scatter(kmeans_centers[:, 0], kmeans_centers[:, 2],
                c = range(n_lda),
                cmap = plt.cm.rainbow,
                s = 400,
                linewidths = 1.0,
                marker = '^',
                edgecolors = 'black',
                alpha = 0.90);
    plt.xlabel('images'), plt.ylabel('hrefs');
    plt.grid();
    plt.subplot(133);
    plt.scatter(X1[:, 1], X1[:, 2],
                c = kmeans_labels,
                cmap = plt.cm.rainbow,
                s = 50,
                linewidths = 0,
                alpha = 0.05);
    plt.scatter(kmeans_centers[:, 1], kmeans_centers[:, 2],
                c = range(n_lda),
                cmap = plt.cm.rainbow,
                s = 400,
                linewidths = 1.0,
                marker = '^',
                edgecolors = 'black',
                alpha = 0.90);
    plt.xlabel('videos'), plt.ylabel('hrefs');
    plt.grid();
    plt.show();
# ---------------------------------------------------------------------------
# Print the running comparison table (bare expression: notebook display).
# NOTE(review): the table now mixes rows from different feature sets, so the
# line plots below connect points across experiments; filter by model_name
# to inspect one experiment at a time.
# ---------------------------------------------------------------------------
comparison_tbl
# ---------------------------------------------------------------------------
# Plot comparison metrics (silhouette, inertia, process time) vs. n_clusters.
# ---------------------------------------------------------------------------
plt.figure(figsize=(16, 6));
# ... silhouette vs. n_clusters
plt.subplot(131);
plt.scatter(comparison_tbl['n_clusters'],
comparison_tbl['silhouette'],
s = 40,
linewidths = 1.0,
marker = '^',
edgecolors = 'black',
alpha = 0.90);
plt.plot(comparison_tbl['n_clusters'],
comparison_tbl['silhouette'])
plt.xlabel('n_clusters'), plt.ylabel('silhouette');
plt.grid();
# ... inertia vs. n_clusters (elbow plot)
plt.subplot(132);
plt.scatter(comparison_tbl['n_clusters'],
comparison_tbl['inertia'],
s = 40,
linewidths = 1.0,
marker = '^',
edgecolors = 'black',
alpha = 0.90);
plt.plot(comparison_tbl['n_clusters'],
comparison_tbl['inertia'])
plt.xlabel('n_clusters'), plt.ylabel('inertia');
plt.grid();
# ... process time vs. n_clusters (stored as string — see earlier note)
plt.subplot(133);
plt.scatter(comparison_tbl['n_clusters'],
comparison_tbl['process_time'],
s = 40,
linewidths = 1.0,
marker = '^',
edgecolors = 'black',
alpha = 0.90);
#plt.plot(comparison_tbl['n_clusters'],
#         comparison_tbl['process_time'])
plt.xlabel('n_clusters'), plt.ylabel('process_time');
plt.grid();
plt.show();
# KMeans on the entire df_cluster feature set ("all in"); no per-iteration
# plots, only the comparison-table statistics.
# NOTE(review): the frame is used unscaled here, so high-variance columns
# dominate the euclidean distances — consider StandardScaler. Confirm all
# remaining columns are numeric.
X1 = df_cluster
for n_lda in range(2, 12):
    tic = time.perf_counter()   # time.clock() was removed in Python 3.8
    cls_lda = KMeans(n_clusters = n_lda,
                     init = 'k-means++',
                     random_state = 1)
    cls_lda.fit(X1)
    kmeans_labels = cls_lda.labels_          # cluster assignment per row
    kmeans_centers = cls_lda.cluster_centers_
    kmeans_inertia = cls_lda.inertia_
    print ("n_lda, inertia ", n_lda, kmeans_inertia)
    # Silhouette on a 10k random subsample to keep runtime reasonable.
    kmeans_silhouette = metrics.silhouette_score(X1,
                                                 kmeans_labels,
                                                 metric = 'euclidean',
                                                 sample_size = 10000)
    print ("silhouette = ", kmeans_silhouette)
    toc = time.perf_counter()
    # ... save statistics for model comparison
    exe_time = '{0:.4f}'.format(toc-tic)
    raw_data = {
        'model_name' : 'KMeans - all_in',
        'n_clusters' : n_lda,
        'inertia': kmeans_inertia,
        'silhouette': kmeans_silhouette,
        'process_time' : exe_time
    }
    # Unique row label per fitted model (was previously never incremented).
    i_index += 1
    df_tbl = pd.DataFrame(raw_data,
                          columns = ['model_name', 'n_clusters', 'inertia', 'silhouette', 'process_time'],
                          index = [i_index])
    # DataFrame.append was removed in pandas 2.0 — use pd.concat instead.
    comparison_tbl = pd.concat([comparison_tbl, df_tbl])
# ---------------------------------------------------------------------------
# (No per-cluster plots for the all-in model.)
# Plot comparison metrics (silhouette, inertia, process time) vs. n_clusters
# across everything accumulated in comparison_tbl.
# ---------------------------------------------------------------------------
plt.figure(figsize=(16, 6));
# ... silhouette vs. n_clusters
plt.subplot(131);
plt.scatter(comparison_tbl['n_clusters'],
comparison_tbl['silhouette'],
s = 40,
linewidths = 1.0,
marker = '^',
edgecolors = 'black',
alpha = 0.90);
plt.plot(comparison_tbl['n_clusters'],
comparison_tbl['silhouette'])
plt.xlabel('n_clusters'), plt.ylabel('silhouette');
plt.grid();
# ... inertia vs. n_clusters (elbow plot)
plt.subplot(132);
plt.scatter(comparison_tbl['n_clusters'],
comparison_tbl['inertia'],
s = 40,
linewidths = 1.0,
marker = '^',
edgecolors = 'black',
alpha = 0.90);
plt.plot(comparison_tbl['n_clusters'],
comparison_tbl['inertia'])
plt.xlabel('n_clusters'), plt.ylabel('inertia');
plt.grid();
# ... process time vs. n_clusters (stored as string — see earlier note)
plt.subplot(133);
plt.scatter(comparison_tbl['n_clusters'],
comparison_tbl['process_time'],
s = 40,
linewidths = 1.0,
marker = '^',
edgecolors = 'black',
alpha = 0.90);
#plt.plot(comparison_tbl['n_clusters'],
#         comparison_tbl['process_time'])
plt.xlabel('n_clusters'), plt.ylabel('process_time');
plt.grid();
plt.show();
The following discussion is adapted from: http://hdbscan.readthedocs.io/en/latest/comparing_clustering_algorithms.html
DBSCAN is a density based algorithm – it assumes clusters for dense regions. It is also the first actual clustering algorithm we’ve looked at: it doesn’t require that every point be assigned to a cluster and hence doesn’t partition the data, but instead extracts the ‘dense’ clusters and leaves sparse background classified as ‘noise’.
In practice DBSCAN is related to agglomerative clustering.
As a first step DBSCAN transforms the space according to the density of the data: points in dense regions are left alone, while points in sparse regions are moved further away. Applying single linkage clustering to the transformed space results in a dendrogram, which we cut according to a distance parameter (called epsilon or eps in many implementations) to get clusters. Importantly any singleton clusters at that cut level are deemed to be ‘noise’ and left unclustered. This provides several advantages: we get the manifold following behaviour of agglomerative clustering, and we get actual clustering as opposed to partitioning. Better yet, since we can frame the algorithm in terms of local region queries we can use various tricks such as kdtrees to get exceptionally good performance and scale to dataset sizes that are otherwise unapproachable with algorithms other than K-Means.
There are some catches however. Obviously epsilon can be hard to pick; you can do some data analysis and get a good guess, but the algorithm can be quite sensitive to the choice of the parameter. The density based transformation depends on another parameter (min_samples in sklearn).
Finally the combination of min_samples and eps amounts to a choice of density and the clustering only finds clusters at or above that density; if your data has variable density clusters then DBSCAN is either going to miss them, split them up, or lump some of them together depending on your parameter choices.
So, in summary: DBSCAN extracts dense clusters and leaves sparse points unassigned as noise, but its results hinge on a good choice of eps and min_samples, and it struggles when clusters have varying density.
So how does it cluster our test dataset? I played with a few epsilon values until I got something reasonable, but there was little science to this – getting the parameters right can be hard.
# Accumulator table for the DBSCAN parameter sweep; one row appended per
# (eps, min_samples) combination. inertia is kept for schema compatibility
# with comparison_tbl even though DBSCAN has no inertia.
dbscan_tbl = pd.DataFrame(columns = [
    'model_name',
    'n_clusters',
    'epsilon',
    'min_points',
    'inertia',
    'silhouette',
    'process_time'])
# Running row index for the table (the dead `i_index = []` assignment,
# immediately overwritten, was removed).
i_index = 0
# preparation for cross validation and model comparison, each classifier is appended once model is fit
models = []
%%time
from sklearn.cluster import DBSCAN
params = []
for epsilon in [0.020, 0.03, 0.05, 0.06, 0.07]:
for min_pts in range (10, 200, 20):
tic = time.clock()
X1 = df_cluster[['ln_LDA_00','ln_LDA_01', 'ln_LDA_02', 'ln_LDA_03', 'ln_LDA_04']]
# append on the clustering
cls_fam = DBSCAN(eps = epsilon,
min_samples = min_pts,
n_jobs = -1)
cls_fam.fit(X1)
dbscan_labels = cls_fam.labels_ # the labels from kmeans clustering
dbscan_nclusters = len(set(dbscan_labels))
print ("eps, min_pts, nclusters = ", epsilon, min_pts, dbscan_nclusters)
dbscan_silhouette = metrics.silhouette_score(X1,
dbscan_labels,
metric = 'euclidean',
sample_size = 10000)
print ("silhouette = ", dbscan_silhouette)
toc = time.clock()
# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
# ... - save statistics for model comparison
# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
exe_time = '{0:.4f}'.format(toc-tic)
raw_data = {
'model_name' : 'DBScan - LDA features',
'n_clusters' : dbscan_nclusters,
'epsilon' : epsilon,
'min_points' : min_pts,
'inertia': 0,
'silhouette': dbscan_silhouette,
'process_time' : exe_time
}
df_tbl = pd.DataFrame(raw_data,
columns = ['model_name', 'n_clusters', 'epsilon', 'min_points', 'inertia', 'silhouette', 'process_time'],
index = [i_index + 1])
dbscan_tbl = dbscan_tbl.append(df_tbl)
# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
# ... - make some plots of clusters
# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
plt.figure(figsize=(12, 12))
plt.subplot(221)
X1 = X1.values
plt.scatter(X1[:, 0], X1[:, 1],
c = dbscan_labels,
cmap = plt.cm.rainbow,
s = 50,
linewidths = 0,
alpha = 0.05)
plt.xlabel('LDA_00'), plt.ylabel('LDA_01')
plt.grid()
plt.subplot(222)
plt.scatter(X1[:, 0], X1[:, 2],
c = dbscan_labels,
cmap = plt.cm.rainbow,
s = 50,
linewidths = 0,
alpha = 0.05)
plt.xlabel('LDA_00'), plt.ylabel('LDA_02')
plt.grid()
plt.subplot(223)
plt.scatter(X1[:, 0], X1[:, 3],
c = dbscan_labels,
cmap = plt.cm.rainbow,
s = 50,
linewidths = 0,
alpha = 0.05)
plt.xlabel('LDA_00'), plt.ylabel('LDA_03')
plt.grid()
plt.subplot(224)
plt.scatter(X1[:, 0], X1[:, 4],
c = dbscan_labels,
cmap = plt.cm.rainbow,
s = 50,
linewidths = 0,
alpha = 0.05)
plt.xlabel('LDA_00'), plt.ylabel('LDA_04')
plt.grid()
plt.show();
# y = df_imputed['Survived']
# X = df_imputed[['IsMale','Pclass','Fare']]
# X = np.column_stack((X,pd.get_dummies(newfeature_fam)))
# acc = cross_val_score(clf,X,y=y,cv=cv)
# params.append((n_fare,n_fam,acc.mean()*100,acc.std()*100)) # save state
# print (eps,mpts,"Average accuracy = ", acc.mean()*100, "+-", acc.std()*100)
# Print the DBSCAN sweep table (bare expression: notebook display).
dbscan_tbl
# ---------------------------------------------------------------------------
# Plot DBSCAN sweep metrics against min_points.
# NOTE(review): rows for all epsilon values are mixed here, so the line plots
# zig-zag across epsilons; group by 'epsilon' to see one curve per setting.
# ---------------------------------------------------------------------------
plt.figure(figsize=(16, 6));
# ... silhouette vs. min_points
plt.subplot(131);
plt.scatter(dbscan_tbl['min_points'],
dbscan_tbl['silhouette'],
s = 40,
linewidths = 1.0,
marker = '^',
edgecolors = 'black',
alpha = 0.90);
plt.plot(dbscan_tbl['min_points'],
dbscan_tbl['silhouette'])
plt.xlabel('min_points'), plt.ylabel('silhouette');
plt.grid();
# ... number of clusters found vs. min_points
plt.subplot(132);
plt.scatter(dbscan_tbl['min_points'],
dbscan_tbl['n_clusters'],
s = 40,
linewidths = 1.0,
marker = '^',
edgecolors = 'black',
alpha = 0.90);
plt.plot(dbscan_tbl['min_points'],
dbscan_tbl['n_clusters'])
plt.xlabel('min_points'), plt.ylabel('n_clusters');
plt.grid();
# ... process time vs. min_points (stored as string — see earlier note)
plt.subplot(133);
plt.scatter(dbscan_tbl['min_points'],
dbscan_tbl['process_time'],
s = 40,
linewidths = 1.0,
marker = '^',
edgecolors = 'black',
alpha = 0.90);
#plt.plot(dbscan_tbl['n_clusters'],
#         dbscan_tbl['process_time'])
plt.xlabel('min_points'), plt.ylabel('process_time');
plt.grid();
plt.show();
# Accumulator table for the spectral-clustering sweep; one row appended per
# cluster count. inertia is kept for schema compatibility (always 0 here).
spc_tbl = pd.DataFrame(columns = [
    'model_name',
    'n_clusters',
    'inertia',
    'silhouette',
    'process_time'])
# Running row index for the table (the dead `i_index = []` assignment,
# immediately overwritten, was removed).
i_index = 0
# preparation for cross validation and model comparison, each classifier is appended once model is fit
models = []
from sklearn.cluster import SpectralClustering
# If a string, this may be one of
# ‘nearest_neighbors’, ‘precomputed’, ‘rbf’
# or one of the kernels supported by sklearn.metrics.pairwise_kernels
# Spectral clustering over channel/content/media features, sweeping n_clusters.
# NOTE(review): assumes df_cluster contains data_channel_n, ln_n_tokens_content,
# ln_num_hrefs, ln_num_imgs, ln_num_videos — not created in the visible cells;
# confirm they come from the cleaned CSV.
# NOTE(review): 'model_name' says 'LDA features' but the inputs are content
# features — rename if downstream analysis keys on it.
# (The loop-body indentation was lost in the notebook export; restored here.)
for n_clstr in range(2, 12):
    tic = time.perf_counter()   # time.clock() was removed in Python 3.8
    print ("n_clusters = ", n_clstr)
    X1 = df_cluster[['data_channel_n',
                     'ln_n_tokens_content',
                     'ln_num_hrefs',
                     'ln_num_imgs',
                     'ln_num_videos',]]
    # Spectral clustering scales poorly with n — fit on a 10% random sample.
    X1 = X1.sample(frac = 0.1)
    spc = SpectralClustering(n_clusters = n_clstr,
                             affinity = 'nearest_neighbors')
    spc_labels = spc.fit_predict(X1)
    spc_silhouette = metrics.silhouette_score(X1,
                                              spc_labels,
                                              metric = 'euclidean',
                                              sample_size = 10000)
    print ("silhouette = ", spc_silhouette)
    toc = time.perf_counter()
    # ... save statistics for model comparison
    exe_time = '{0:.4f}'.format(toc-tic)
    raw_data = {
        'model_name' : 'spc - LDA features',
        'n_clusters' : n_clstr,
        'inertia': 0,
        'silhouette': spc_silhouette,
        'process_time' : exe_time
    }
    # Unique row label per fitted model (was previously never incremented).
    i_index += 1
    df_tbl = pd.DataFrame(raw_data,
                          columns = ['model_name', 'n_clusters', 'inertia', 'silhouette', 'process_time'],
                          index = [i_index])
    # DataFrame.append was removed in pandas 2.0 — use pd.concat instead.
    spc_tbl = pd.concat([spc_tbl, df_tbl])
    # ... make some plots of clusters: each feature vs. data_channel_n,
    # colored by cluster label.
    plt.figure(figsize=(12, 12));
    plt.subplot(221);
    X1 = X1.values;
    plt.scatter(X1[:, 0], X1[:, 1],
                c = spc_labels,
                cmap = plt.cm.rainbow,
                s = 50,
                linewidths = 0,
                alpha = 0.05);
    plt.xlabel('data_channel_n'), plt.ylabel('ln_n_tokens_content');
    plt.grid();
    plt.subplot(222);
    plt.scatter(X1[:, 0], X1[:, 2],
                c = spc_labels,
                cmap = plt.cm.rainbow,
                s = 50,
                linewidths = 0,
                alpha = 0.05);
    plt.xlabel('data_channel_n'), plt.ylabel('ln_num_hrefs');
    plt.grid();
    plt.subplot(223);
    plt.scatter(X1[:, 0], X1[:, 3],
                c = spc_labels,
                cmap = plt.cm.rainbow,
                s = 50,
                linewidths = 0,
                alpha = 0.05);
    plt.xlabel('data_channel_n'), plt.ylabel('ln_num_imgs');
    plt.grid();
    plt.subplot(224);
    plt.scatter(X1[:, 0], X1[:, 4],
                c = spc_labels,
                cmap = plt.cm.rainbow,
                s = 50,
                linewidths = 0,
                alpha = 0.05);
    plt.xlabel('data_channel_n'), plt.ylabel('ln_num_videos');
    plt.grid();
    plt.show();
# ---------------------------------------------------------------------------
# Print the spectral-clustering sweep table (bare expression: notebook display).
# ---------------------------------------------------------------------------
spc_tbl
# ---------------------------------------------------------------------------
# Plot sweep metrics vs. n_clusters (two panels: silhouette and process time).
# ---------------------------------------------------------------------------
plt.figure(figsize=(16, 6));
# ... silhouette vs. n_clusters
plt.subplot(121);
plt.scatter(spc_tbl['n_clusters'],
            spc_tbl['silhouette'],
            s = 40,
            linewidths = 1.0,
            marker = '^',
            edgecolors = 'black',
            alpha = 0.90);
plt.plot(spc_tbl['n_clusters'],
         spc_tbl['silhouette'])
plt.xlabel('n_clusters'), plt.ylabel('silhouette');
plt.grid();
# ... process time vs. n_clusters (stored as string — see earlier note)
# Fixed: was subplot(133), inconsistent with the 1x2 grid declared above.
plt.subplot(122);
plt.scatter(spc_tbl['n_clusters'],
            spc_tbl['process_time'],
            s = 40,
            linewidths = 1.0,
            marker = '^',
            edgecolors = 'black',
            alpha = 0.90);
#plt.plot(spc_tbl['n_clusters'],
#         spc_tbl['process_time'])
plt.xlabel('n_clusters'), plt.ylabel('process_time');
plt.grid();
plt.show();